* Reset settings and initialize log file
* NOTE(review): -launch- is a project-defined command whose source is not shown here;
* presumably it pairs with the -unlaunch- call at the end of this script -- confirm
launch, path("build/cps_bms_builder")

*-------------------------------------------------------------------------------
* Price and Wasserman (2024), "The Summer Drop in Female Employment"
*
* Description: Assemble and process CPS basic monthly files.
*-------------------------------------------------------------------------------


* Prepare value labels for state FIPS codes
*-------------------------------------------------------------------------------

* Reserve a temporary file that will hold the label definitions
tempfile state_fips_lbl

* Label each FIPS code with its state name, then write the label out to that file
* (the file is re-run with -do- when the adult extract is processed)
use "$basepath/data/derived/state_fips.dta", clear
labmask state_fips, values(state_name) lblname(state_fips_lbl)
label save state_fips_lbl using `state_fips_lbl'


* Prepare PCE deflator
*-------------------------------------------------------------------------------

* Pull the PCE price index from FRED
freduse PCEPI, clear
rename PCEPI pce

* Parse the calendar year and month out of the date string
gen int year = real(substr(date, 1, 4))
gen byte month = real(substr(date, 6, 2))

* Discard months outside our analysis period (which includes the previous year)
drop if !inrange(year, 1988, 2019)

* Build the monthly date and put the series in chronological order
gen int tm = ym(year, month)
format tm %tm
sort tm

* Rescale in double precision so that the final month (Dec. 2019) equals one
recast double pce
tempname pce_base
scalar `pce_base' = pce[_N]
replace pce = pce/`pce_base'
assert pce == 1 if _n == _N

* Retain and label the two variables we need
keep tm pce
label variable tm  "Year/month"
label variable pce "PCE index"

* Write the deflator to disk
compress
save "$basepath/data/derived/pce.dta", replace


* Construct dataset of children
*-------------------------------------------------------------------------------

* Read a handful of variables for residents younger than 18
gzuse cpsid cpsidp pernum year month sex age momloc poploc momloc2 poploc2 if age < 18 using "$basepath/data/raw/cps/bms/$cps_bms.dta.gz", clear

* Confirm that the extract spans exactly 1989-2019
summarize year, meanonly
assert r(min) == 1989 & r(max) == 2019

* Shorten the names of the household/person identifiers
rename (cpsid cpsidp) (hid pid)

* Build the monthly date
gen int tm = ym(year, month)
format tm %tm

* Each household/person-number/month combination must appear exactly once
gisid hid pernum tm

* Female indicator (left missing unless sex is coded 1 or 2)
gen byte female = (sex == 2) if inlist(sex, 1, 2)
label define female_lbl 0 "Male" 1 "Female", replace
label values female female_lbl

* Keep the final variables, arranged in their final order
local childvars "hid pid pernum tm female age momloc poploc momloc2 poploc2"
keep `childvars'
order `childvars'

* Attach variable labels
label variable hid      "CPS ID, household record"
label variable pid      "CPS ID, person record"
label variable pernum   "Person number in sample unit"
label variable tm       "Year/month"
label variable female   "Female"
label variable age      "Age"
label variable momloc   "Person number of first mother"
label variable poploc   "Person number of first father"
label variable momloc2  "Person number of second mother"
label variable poploc2  "Person number of second father"

* Detach the value label from age
label values age

* Sort, compress, and write the children file
sort hid pernum tm
compress
gzsave "$basepath/data/derived/cps_bms_children.dta.gz", replace


* Construct dataset of adults
*-------------------------------------------------------------------------------

* Read the identifiers, weights, demographics, labor market, and earnings variables
gzuse ///
	cpsid cpsidp marbasecidp year month pernum mish wtfinl lnkfw1mwt ///
	sex age educ marst statefip race hispan sploc ///
	empstat classwkr wkstat occ1990 ind1990 uhrsworkt ahrsworkt ///
	whyunemp whyabsnt nilfact schlcoll ///
	earnwt earnweek paidhour hourwage uhrsworkorg uh_payabs_b2 ///
	using "$basepath/data/raw/cps/bms/$cps_bms.dta.gz", clear

* Confirm that the extract spans exactly 1989-2019
summarize year, meanonly
assert r(min) == 1989 & r(max) == 2019

* Remove records outside the civilian employment universe (empstat 0 or 1)
* along with anyone still under age 15
drop if inlist(empstat, 0, 1) | age < 15

* Every remaining record must carry non-missing, non-zero person and household IDs
foreach id of varlist cpsidp cpsid {
	assert !missing(`id') & `id' != 0
}


* Process technical variables
*-------------------------------------------------------------------------------

* Shorten the names of the household/person identifiers
rename (cpsid cpsidp) (hid pid)

* Build the monthly date
gen int tm = ym(year, month)
format tm %tm

* Each person must appear at most once per month
gisid pid tm

* Detach the value label from month-in-sample
label values mish

* Store the three weights as floats to save space
foreach wt of varlist wtfinl lnkfw1mwt earnwt {
	recast float `wt', force
}

* Declare the person-by-month panel (needed for the lag operators used later)
tsset pid tm

* List of technical variables carried into the final extract
local idvars "pid hid tm pernum mish wtfinl lnkfw1mwt marbasecidp"


* Process demographic variables
*-------------------------------------------------------------------------------

* Recode sex into a female indicator (sex must be coded 1 or 2 for everyone)
assert inlist(sex, 1, 2)
gen byte female = (sex == 2)
label define female_lbl 0 "Male", replace
label define female_lbl 1 "Female", add
label values female female_lbl

* Topcode age at 80 (used as a topcode starting in 2002)
assert inrange(age, 0, 99)
recode age (80/99 = 80)
label values age age_lbl

* Define the age label: 0 and the topcode get descriptive text, other ages their number
forvalues k = 0/80 {
	if `k' == 0 {
		label define age_lbl `k' "Under 1 year", replace
	}
	else if `k' > 0 & `k' < 80 {
		label define age_lbl `k' "`k'", add
	}
	else if `k' == 80 {
		label define age_lbl `k' "80 or above", add
	}
}

* Recode race and ethnicity into four groups; Hispanic origin overrides race.
* Per the IPUMS-CPS codebook, HISPAN codes for Hispanic origin run from 100 through
* 612: codes 600-612 cover "Other Hispanic" and Central/South American, so the
* upper bound must be 612 (a bound of 500 would leave those respondents classified
* by race). Codes 0 (not Hispanic) and 901/902 (unknown/NA) stay with the race group.
recode race (999 = .) (100 = 1) (200 = 2) (300/830 = 4), gen(wbho)
replace wbho = 3 if inrange(hispan, 100, 612)
label define wbho_lbl 1 "White non-Hispanic", replace
label define wbho_lbl 2 "Black non-Hispanic", add
label define wbho_lbl 3 "Hispanic or Latinx", add
label define wbho_lbl 4 "Other non-Hispanic", add
label values wbho wbho_lbl

* Distinguish four levels of educational attainment (codes 0, 1, and 999 become missing)
recode educ (0/1 999 = .) (2/60 = 1) (70/73 = 2) (80/100 = 3) (110/125 = 4)
label define educ_lbl 1 "Less than high school", replace
label define educ_lbl 2 "High school degree", add
label define educ_lbl 3 "Some college", add
label define educ_lbl 4 "College degree or higher", add
label values educ educ_lbl

* Recode marital status (code 9 becomes missing)
gen byte marstat = marst if marst != 9
label define marstat_lbl 1 "Married, spouse present", replace
label define marstat_lbl 2 "Married, spouse absent", add
label define marstat_lbl 3 "Separated", add
label define marstat_lbl 4 "Divorced", add
label define marstat_lbl 5 "Widowed", add
label define marstat_lbl 6 "Single", add
label define marstat_lbl 7 "Widowed or divorced", add
label values marstat marstat_lbl

* Incorporate state FIPS codes, re-running the label definitions saved earlier
rename statefip state_fips
assert inrange(state_fips, 1, 56)
quietly do `state_fips_lbl'
label values state_fips state_fips_lbl

* Recode enrollment in school or college (left missing for any other schlcoll code)
gen byte school_status = .
replace school_status = 1 if inlist(schlcoll, 1, 2)
replace school_status = 2 if inlist(schlcoll, 3, 4)
replace school_status = 3 if schlcoll == 5

label define school_status_lbl 1 "In high school (full-time or part-time)", replace
label define school_status_lbl 2 "In college/university (full-time or part-time)", add
label define school_status_lbl 3 "Not attending school, college, or university", add
label values school_status school_status_lbl

* Specify demographic variables carried into the final extract
local demvars "female age wbho educ marstat sploc state_fips school_status"


* Validate longitudinal links (adapting Madrian and Lefgren, 2000)
*-------------------------------------------------------------------------------

* Set the full dataset aside so it can be restored once validation is done
tempfile core
save `core'

* Work with only the variables that enter the validation
keep pid tm mish sex wbho age

* Month-to-month check: relative to the previous calendar month, sex and
* race/ethnicity must agree and age must differ by no more than two years.
* The flags are missing when there is no record for the previous month, in
* which case linked_monthly is zero.
tempvar d_sex d_wbho d_age
gen byte `d_sex'  = (sex != L.sex)       if tm == L.tm + 1
gen byte `d_wbho' = (wbho != L.wbho)     if tm == L.tm + 1
gen byte `d_age'  = abs(age - L.age) > 2 if tm == L.tm + 1
gen byte linked_monthly = (`d_sex' == 0 & `d_wbho' == 0 & `d_age' == 0)
drop `d_sex' `d_wbho' `d_age'

* Whole-history check: sex and race/ethnicity must match the person's first
* record, and no age may exceed the person's lowest age by more than two years.
* A running count of violations is built within person; the person passes only
* if the count is still zero at the last record.
tempvar any_sex any_wbho any_age running
bysort pid (tm):  gen byte `any_sex'  = (sex != sex[1])
bysort pid (tm):  gen byte `any_wbho' = (wbho != wbho[1])
bysort pid (age): gen byte `any_age'  = (age > age[1] + 2)
bysort pid (tm):  gen byte `running'  = sum(`any_sex' + `any_wbho' + `any_age')
bysort pid (tm):  gen byte linked_complete = (`running'[_N] == 0)
drop `any_sex' `any_wbho' `any_age' `running'

* Anyone who passes the whole-history check must also pass each monthly check
assert linked_complete <= linked_monthly if tm == L.tm + 1

* Save the two linkability flags for merging back in later
local linkvars "linked_monthly linked_complete"
keep pid tm `linkvars'
tempfile linkability
save `linkability'

* Restore the full dataset
use `core', clear


* Process labor market variables
*-------------------------------------------------------------------------------

* Extract employment, unemployment, and non-participation
* (the assert confirms that every record falls into exactly one of the three)
gen byte emp   = inlist(empstat, 10, 12)
gen byte unemp = inlist(empstat, 20, 21, 22)
gen byte nlf   = inlist(empstat, 30, 31, 32, 33, 34, 35, 36)
assert emp + unemp + nlf == 1

* Extract part-time/full-time/not-at-work status (correcting one anomaly)
* The single hard-coded person-month below is set to part-time by hand so that
* the three categories partition the employed, as the final assert requires
gen byte ftemp  = (empstat != 12) & inlist(wkstat, 10, 11, 14, 15)
gen byte ptemp  = (empstat != 12) & inlist(wkstat, 12, 20, 21, 22, 40, 41)
gen byte absent = (empstat == 12)
replace ptemp = 1 if pid == 19940501210002 & tm == tm(1994m7)
assert absent == 1 if inlist(wkstat, 13, 42)
assert ftemp + ptemp + absent == emp

* Record whether an individual was absent with pay
* (populated only for absent workers from 1994 onward; missing otherwise)
gen byte paid_absence = (uh_payabs_b2 == 1) if absent == 1 & year >= 1994

* Tag self-employed workers (missing for the non-employed)
assert !missing(classwkr) & !inlist(classwkr, 0, 99) if emp == 1
gen byte selfemp = inlist(classwkr, 10, 13, 14) if emp == 1
assert missing(selfemp) == (emp == 0)

* Recode as missing if values are not in the universe (niu) or didn't answer
recode whyunemp (0 = .)
recode whyabsnt (0 = .)

* Retain IPUMS value labels for select variables
label copy EMPSTAT empstat_lbl
label copy WKSTAT wkstat_lbl
label copy WHYUNEMP whyunemp_lbl
label copy WHYABSNT whyabsnt_lbl
label values empstat empstat_lbl
label values wkstat wkstat_lbl
label values whyunemp whyunemp_lbl
label values whyabsnt whyabsnt_lbl

* Process usual hours worked (codes 997 and 999 become missing)
replace uhrsworkt = . if inlist(uhrsworkt, 997, 999)

* Process actual hours worked last week (999 becomes missing; capped at 168 hours)
replace ahrsworkt = . if ahrsworkt == 999
replace ahrsworkt = min(ahrsworkt, 168) if !missing(ahrsworkt)

* Set actual hours worked to missing for everyone who is not employed. The assert
* confirms that the only non-employed records reporting hours are non-participants
* with <15 unpaid hours (empstat 35).
* NOTE(review): this comment previously said these hours are recoded "as zero", but
* the code sets them to missing -- confirm which behavior is intended
assert empstat == 35 if emp == 0 & !missing(ahrsworkt)
replace ahrsworkt = . if emp == 0

* Verify that actual hours worked are consistent with presence/absence from work
* (among the employed, hours are missing exactly when the worker was absent)
assert missing(ahrsworkt) == (absent == 1) if emp == 1

* Extract major activity while out of the labor force (available 1994 onward)
* The extended missing value .a marks non-participants whose activity is unknown
* (nilfact == 99), so that it passes the "whynilf != ." check below
assert inlist(empstat, 32, 34, 36) if year >= 1994 & nlf == 1
gen byte whynilf = .
replace whynilf = 1 if year >= 1994 & empstat == 36
replace whynilf = 2 if year >= 1994 & (empstat == 32 | (empstat == 34 & inlist(nilfact, 1, 2)))
replace whynilf = 3 if year >= 1994 & empstat == 34 & nilfact == 3
replace whynilf = 4 if year >= 1994 & empstat == 34 & nilfact == 4
replace whynilf = 5 if year >= 1994 & empstat == 34 & nilfact == 6
replace whynilf = .a if year >= 1994 & empstat == 34 & nilfact == 99
assert whynilf != . if year >= 1994 & nlf == 1

label define whynilf_lbl 1 "Retired", replace
label define whynilf_lbl 2 "Disabled, ill, or unable to work", add
label define whynilf_lbl 3 "In school", add
label define whynilf_lbl 4 "Taking care of house or family", add
label define whynilf_lbl 5 "Something else/other", add
label define whynilf_lbl .a "Unknown (1994 or later)", add
label values whynilf whynilf_lbl

* Map 3-digit occupation codes into 2-digit codes
* NOTE(review): this external do-file is expected to create occ1990_2d, which is
* listed among the employment variables below -- confirm against that file
quietly do "$basepath/code/build/create_occ1990_2d.do"

* Retain IPUMS value labels for hours, occupation, and industry
label copy UHRSWORKT uhrsworkt_lbl
label copy OCC1990 occ1990_lbl
label copy IND1990 ind1990_lbl
label values uhrsworkt uhrsworkt_lbl
label values occ1990 occ1990_lbl
label values ind1990 ind1990_lbl

* Specify employment variables carried into the final extract
local empvars "emp unemp nlf ftemp ptemp absent empstat wkstat paid_absence selfemp occ1990 ind1990 occ1990_2d uhrsworkt ahrsworkt whyunemp whynilf whyabsnt"


* Process weekly earnings
*-------------------------------------------------------------------------------

* Replace NIU with missing
replace earnweek = . if earnweek == 9999.99

* Note the topcode for weekly earnings.
* The topcode is stored as a double: under Stata's default float type, 2884.61
* cannot be represented exactly, so an exact comparison between a float topcode
* and double-precision earnings would never match and the topcode adjustment
* below would silently be skipped from 1998 onward.
gen double topcode = .
replace topcode = 1923    if inrange(year, 1989, 1997)
replace topcode = 2884.61 if inrange(year, 1998, .)

* Verify that earnings are positive and respect the topcode. Comparisons allow a
* half-cent tolerance so they hold regardless of the storage type of earnweek
* (assumes earnings are recorded in whole cents, as the 9999.99 NIU code and the
* 2884.61 topcode suggest).
assert earnweek > 0 & earnweek <= topcode + .005 if !missing(earnweek)

* Apply a constant adjustment factor to topcoded earnings
replace earnweek = 1.5 * earnweek if earnweek >= topcode - .005 & !missing(earnweek, topcode)
drop topcode

* Inflate to constant dollars (note: weekly earnings pertain to the survey year)
* Every record must find its month in the deflator file; deflator months with no
* matching records are allowed and discarded
merge m:1 tm using "$basepath/data/derived/pce.dta", assert(2 3) keep(3) nogenerate
replace earnweek = earnweek/pce


* Process hourly wages
*-------------------------------------------------------------------------------

* Indicator for being paid by the hour (paidhour 2 = yes, 1 = no, otherwise missing)
gen hourly = (paidhour == 2) if inlist(paidhour, 1, 2)
label define hourly_lbl 0 "Not paid by the hour", replace
label define hourly_lbl 1 "Paid by the hour", add
label values hourly hourly_lbl

* Replace NIU with missing
replace hourwage = . if hourwage == 999.99
replace uhrsworkorg = . if inlist(uhrsworkorg, 998, 999)

* Note the topcode for hourly earnings, which varies by year and by usual hours worked
* (see https://cps.ipums.org/cps/hourly_earnings_topcodes.shtml)
* Through 2002 the hours threshold is 20 and the weekly cap is 1923.07; from 2003 on
* the threshold is 29 and the cap is 2885.07. Below the threshold the topcode is 99.99.
local both_obs "!missing(hourwage, uhrsworkorg)"
local early    "inrange(year, 1989, 2002)"
local late     "inrange(year, 2003, .)"
gen topcode = .
replace topcode = 99.99 if `both_obs' & ((`early' & uhrsworkorg < 20) | (`late' & uhrsworkorg < 29))
replace topcode = (1923.07/uhrsworkorg) if `both_obs' & `early' & uhrsworkorg >= 20
replace topcode = (2885.07/uhrsworkorg) if `both_obs' & `late' & uhrsworkorg >= 29

* Impose the topcode on a modest number of observations that violate the topcode
* (only early-period records at or above the hours threshold)
local violates "hourwage > topcode & !missing(hourwage, topcode) & `early' & uhrsworkorg >= 20"
count if `violates'
replace hourwage = topcode if `violates'

* Verify that the topcode now holds universally (modulo floating-point error)
assert hourwage > 0 & hourwage <= topcode + .00001 if !missing(hourwage, topcode)

* Apply a constant adjustment factor to topcoded wages
replace hourwage = 1.5 * hourwage if hourwage >= topcode - .00001 & !missing(hourwage, topcode)

* Inflate to constant dollars using the PCE index already merged onto the data
replace hourwage = hourwage/pce

* Specify earnings variables carried into the final extract
local earnvars "earnweek earnwt hourly hourwage uhrsworkorg"


* Consolidate variables
*-------------------------------------------------------------------------------

* Drop everything except the variable groups assembled above
local processed `idvars' `demvars' `empvars' `earnvars'
keep `processed'

* Bring back the linkability flags; every person-month must match on both sides
merge 1:1 pid tm using `linkability', assert(3) nogenerate

* Arrange variables by group, with the linkability flags after the demographics
order `idvars' `demvars' `linkvars' `empvars' `earnvars'


* Finalize extract
*-------------------------------------------------------------------------------

* Label variables (the hid label previously lacked its closing quotation mark)
label variable pid             "CPS ID, person record"
label variable hid             "CPS ID, household record"
label variable tm              "Year/month"
label variable pernum          "Person number in sample unit"
label variable mish            "Month in sample, household level"
label variable wtfinl          "Final basic weight"
label variable lnkfw1mwt       "Longitudinal weight for two adjacent months"
label variable marbasecidp     "Unique ID for linking March basic to ASEC"
label variable linked_monthly  "Valid longitudinal link relative to previous month"
label variable linked_complete "Valid longitudinal links across all observations"
label variable female          "Female"
label variable age             "Age"
label variable wbho            "Race and ethnicity"
label variable educ            "Educational attainment"
label variable marstat         "Marital status"
label variable sploc           "Person number of spouse"
label variable state_fips      "State FIPS code"
label variable school_status   "Attendance at high school or college/university"
label variable emp             "Employed"
label variable unemp           "Unemployed"
label variable nlf             "Not in labor force"
label variable ftemp           "Full-time employed last week"
label variable ptemp           "Part-time employed last week"
label variable absent          "Employed, not at work last week"
label variable empstat         "Employment status"
label variable wkstat          "Full/part-time status"
label variable paid_absence    "Paid for time off last week (if absent)"
label variable selfemp         "Indicator for being self-employed (populated only for the employed)"
label variable occ1990         "3-digit occupation, 1990 basis"
label variable ind1990         "3-digit industry, 1990 basis"
label variable occ1990_2d      "2-digit occupation, 1990 basis"
label variable uhrsworkt       "Hours usually worked per week"
label variable ahrsworkt       "Hours actually worked last week"
label variable whyunemp        "Reason for being unemployed"
label variable whynilf         "Reason for being (or major activity while) out of the labor force"
label variable whyabsnt        "Reason for not being at work"
label variable earnweek        "Weekly earnings, Dec. 2019 $"
label variable earnwt          "Earnings weight"
label variable hourly          "Paid by the hour"
label variable hourwage        "Hourly wage, Dec. 2019 $"
label variable uhrsworkorg     "Usual hours worked, ORG"

* Save the cleaned extract, sorted by person and month
sort pid tm
compress
gzsave "$basepath/data/derived/cps_bms.dta.gz", replace

* Close the log file
unlaunch
